In this notebook we are going to generate summary statistics for our data.
In [ ]:
# Import the libraries we need
import pandas as pd
In [ ]:
# Import the dataset from the CSV file.
# NOTE: this is an absolute, machine-specific path; point it at your own copy
# of Accidents7904.csv before running the notebook.
accidents_data_file = (
    '/Users/robert.dempsey/Dropbox/Private/Art of Skill Hacking/'
    'Books/Python Business Intelligence Cookbook/Data/Stats19-Data1979-2004/Accidents7904.csv'
)

# tupleize_cols, error_bad_lines and warn_bad_lines are no longer passed:
# they were only ever given their default values here, and newer pandas
# releases reject them outright, so omitting them keeps the same behaviour
# (malformed rows still raise) while letting the cell run on current pandas.
accidents = pd.read_csv(
    accidents_data_file,
    sep=',',
    header=0,              # first row holds the column names
    index_col=False,       # do not use the first column as the index
    parse_dates=['Date'],  # parse the Date column as datetimes...
    dayfirst=True,         # ...reading them as day-first (DD/MM) dates
    skip_blank_lines=True,
    low_memory=False,      # read the file in one pass rather than in chunks
)
accidents.head()
In [ ]:
# Build the summary statistics table for the whole DataFrame and display it
summary_stats = accidents.describe()
summary_stats
In [ ]:
# Flip the describe() table so each column of the dataset becomes a row
summary_stats = accidents.describe()
summary_stats.T
In [ ]:
# With no arguments describe() only summarises the numeric columns.
# Pass include explicitly to get statistics for the object columns instead.
object_dtypes = ['object']
accidents.describe(include=object_dtypes)
In [ ]:
# Compute the mode of every column, then transpose the result so that
# everything is readable in the IPython Notebook
column_modes = accidents.mode()
column_modes.T
In [ ]:
# Summary statistics for the Weather_Conditions column on its own
weather_conditions = accidents['Weather_Conditions']
weather_conditions.describe()
In [ ]:
# Get the count of each unique value in the Date column.
# Uses the Series.value_counts() method; the top-level pd.value_counts()
# function is deprecated in recent pandas releases.
accidents['Date'].value_counts()
In [ ]:
# Print the smallest and largest values found in the Number_of_Vehicles column.
number_of_vehicles = accidents['Number_of_Vehicles']
smallest = number_of_vehicles.min()
largest = number_of_vehicles.max()
print("Min Value: {}".format(smallest))
print("Max Value: {}".format(largest))
In [ ]:
# Quantiles of the Number_of_Vehicles column at the listed cut points
cut_points = [.05, .1, .25, .5, .75, .9, .99]
accidents['Number_of_Vehicles'].quantile(cut_points)
In [ ]:
# Mean: the average
# Median: the middle value
# Mode: the value that occurs most often
# Range: the difference between the minimum and maximum values
number_of_vehicles = accidents['Number_of_Vehicles']

print("Mean: {}".format(number_of_vehicles.mean()))
print("Median: {}".format(number_of_vehicles.median()))
# mode() returns a Series (several values can tie), so this prints a Series
print("Mode: {}".format(number_of_vehicles.mode()))
# The statistical range is max - min. The built-in range() would only build
# a range object spanning the two values, not compute their difference.
print("Range: {}".format(
    number_of_vehicles.max() - number_of_vehicles.min()
))
In [ ]: